This metric is the share of people who are poor in a county who live in census tracts with poverty rates over 40%. If a county’s overall poverty rate is 20% but people in poverty are spread out evenly across all census tracts, the index would equal 0; if they were heavily concentrated in certain tracts, the index would approach 1.
Process:
The most recent version of this metric uses ACS 2017-2021 5-Year
Data, released on 2022-12-08.
All numbers come from the Census API. The documentation for the Census
API is available here.
We pull all of the race/ethnicity counts for 2021 using
library(censusapi). Note: This will
require a Census
API key. Add the key to census_api_key-template.R and
then delete “template”. It is sourced above.
To do this we have to identify census tracts with poverty rates over 40% in each county, count the number of residents in those tracts who are poor, sum that up and divide it by the total number of poor residents in the county.
options(scipen = 999)
library(tidyverse)
library(censusapi)
library(urbnthemes)
library(reactable)
set_urbn_defaults(style = "print")
source(here::here("06_neighborhoods", "R", "census_api_key.R"))
source(here::here("06_neighborhoods", "R", "get_vars.R"))
https://api.census.gov/data/2021/acs/acs5/variables.html
vars <- c( # Estimate!!Total!!Income in the past 12 months below poverty level
# "B00001_001E", # UNWEIGHTED SAMPLE COUNT OF THE POPULATION
# "B01001_001E", # SEX BY AGE
"B17001_001E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (Total)
"B17001_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE
"B17001_002M",
# "B17001A_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE)
# "B17001A_002M",
"B17001B_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)
"B17001B_002M",
"B17001C_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
"B17001C_002M",
"B17001D_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (ASIAN ALONE)
"B17001D_002M",
"B17001E_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)
"B17001E_002M",
"B17001F_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (SOME OTHER RACE ALONE)
"B17001F_002M",
"B17001G_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (TWO OR MORE RACES)
"B17001G_002M",
"B17001H_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE, NOT HISPANIC OR LATINO)
"B17001H_002M",
"B17001I_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (HISPANIC OR LATINO)
"B17001I_002M"
)
# pull Census tracts for 2021
state_fips <- paste0("state:", unique(urbnmapr::states$state_fips))
tracts <- map_df(state_fips, ~getCensus(name = "acs/acs5",
vars = vars,
region = "tract:*",
regionin = .x,
vintage = 2021)) %>%
as_tibble()
# rename the variables
tracts <- tracts %>%
rename(
people = B17001_001E,
poverty = B17001_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE
poverty_moe = B17001_002M,
#poverty_white = B17001A_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE)
#poverty_white_moe = B17001A_002M,
poverty_black = B17001B_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)
poverty_black_moe = B17001B_002M,
poverty_aian = B17001C_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
poverty_aian_moe = B17001C_002M,
poverty_asian = B17001D_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (ASIAN ALONE)
poverty_asian_moe = B17001D_002M,
poverty_pacific = B17001E_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)
poverty_pacific_moe = B17001E_002M,
poverty_other = B17001F_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (SOME OTHER RACE ALONE)
poverty_other_moe = B17001F_002M,
poverty_twoplus = B17001G_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (TWO OR MORE RACES)
poverty_twoplus_moe = B17001G_002M,
poverty_white_nonhispanic = B17001H_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE, NOT HISPANIC OR LATINO)
poverty_white_nonhispanic_moe = B17001H_002M,
poverty_hispanic = B17001I_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (HISPANIC OR LATINO)
poverty_hispanic_moe = B17001I_002M
)
Some tracts don’t have any population. We drop those tracts.
tracts <- tracts %>%
tidylog::filter(people > 0)
There was a data collection error in Rio Arriba County, NM that affected 2018 ACS estimates (source). We dropped these observations in 2018, but the error does not apply to the 2021 5-year file, so we do not drop these observations. Check the number of people. It should be around 321,897,703.
tracts %>%
summarize(sum(people))
## # A tibble: 1 × 1
## `sum(people)`
## <dbl>
## 1 321897703
We need to combine the small groups into a group for other races and ethnicities. The Census Bureau typically only posts cross tabs for up to two variables. This requires race, ethnicity, and poverty status so the resulting groups are not disjoint.
knitr::include_graphics(here::here("06_neighborhoods", "www", "images", "race.png"))
Combine the race/ethnicity groups into the group of interest.
tracts <- tracts %>%
mutate(
poverty_other_races =
poverty_aian +
poverty_asian +
poverty_pacific +
poverty_other +
poverty_twoplus
) #%>%
#select(-poverty_aian, -poverty_asian, -poverty_pacific, -poverty_other, -poverty_twoplus)
This Census presentation recommends using the maximum margin of error when aggregating multiple zero estimates.
One way this approximation can differ from the actual MOE is if you were aggregating multiple zero estimates. In this case, the approximate MOE could diverge from the actual margin of error. And so the - our recommendation is to only include one zero estimate margin of error and include the largest one.
# pivot the point estimates
values <- tracts %>%
select(state,
county,
tract,
poverty_aian,
poverty_asian,
poverty_pacific,
poverty_other,
poverty_twoplus) %>%
pivot_longer(c(-state, -county, -tract), names_to = "group", values_to = "value")
# pivot the margins of error
moes <- tracts %>%
select(state,
county,
tract,
poverty_aian_moe,
poverty_asian_moe,
poverty_pacific_moe,
poverty_other_moe,
poverty_twoplus_moe) %>%
pivot_longer(c(-state, -county, -tract), names_to = "group", values_to = "moe") %>%
mutate(group = str_replace(group, "_moe", ""))
# combine the point estimates and margins of error
other_moe <- left_join(values, moes, by = c("state", "county", "tract", "group"))
rm(moes, values)
# keep MOE for non-zero estimates and keep the largest MOE for zero estimates
other_moe <- other_moe %>%
group_by(state, county, tract) %>%
mutate(moe = if_else(value == 0, max(moe), moe))
# combine the margins of error
other_moe <- other_moe %>%
summarize(poverty_other_races_moe = sqrt(sum(moe ^ 2))) %>%
ungroup()
# append to the original data set
tracts <- left_join(tracts, other_moe, by = c("state", "county", "tract"))
We convert margins of error to standard errors using 1.645 as the critical value (page 3)
tracts <- tracts %>%
mutate(
poverty_se = poverty_moe / 1.645,
poverty_black_se = poverty_black_moe / 1.645,
poverty_hispanic_se = poverty_hispanic_moe / 1.645,
poverty_other_races_se = poverty_other_races_moe / 1.645,
poverty_white_nonhispanic_se = poverty_white_nonhispanic_moe / 1.645
)
Drop all of the extra variables.
tracts <- tracts %>%
select(
state,
county,
tract,
people,
poverty,
poverty_black,
poverty_hispanic,
poverty_other_races,
poverty_white_nonhispanic,
poverty_se,
poverty_black_se,
poverty_hispanic_se,
poverty_other_races_se,
poverty_white_nonhispanic_se,
poverty_moe,
poverty_black_moe,
poverty_hispanic_moe,
poverty_other_races_moe,
poverty_white_nonhispanic_moe
)
Look at the margins of error. A large share of the Other Races and Ethnicities have coefficients of variation greater than 0.4.
tracts %>%
summarize(mean(poverty_other_races_se / poverty_other_races > 0.4))
## # A tibble: 1 × 1
## `mean(poverty_other_races_se/poverty_other_races > 0.4)`
## <dbl>
## 1 0.985
Let’s look at the margins of error in relation to the counts of people in each race/ethnicity category in each county. Observations to the upper left of the black line have coefficients of variation in excess of 0.4.
tracts %>%
ggplot(aes(poverty_black, poverty_black_se)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(title = "Most Black Estimates Have Large SEs",
subtitle = "Line represents a CV of 0.4") +
coord_equal() +
scatter_grid()
tracts %>%
ggplot(aes(poverty_hispanic, poverty_hispanic_se)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(title = "Most Hispanic Estimates Have Large SEs",
subtitle = "Line represents a CV of 0.4") +
coord_equal() +
scatter_grid()
tracts %>%
ggplot(aes(poverty_other_races, poverty_other_races_se)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(title = "Most Other Races and Ethnicities Estimates Have Large SEs",
subtitle = "Line represents a CV of 0.4") +
coord_equal() +
scatter_grid()
tracts %>%
ggplot(aes(poverty_white_nonhispanic, poverty_white_nonhispanic_se)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(title = "Most White, non-Hispanic Estimates Have Large SEs",
subtitle = "Line represents a CV of 0.4") +
coord_equal() +
scatter_grid()
As mentioned earlier, these race/ethnicity groups are not disjoint. Accordingly, summing the groups will result in population counts that exceed the population. It will also result in poverty counts that are inflated.
tracts %>%
mutate(poverty_summed = poverty_black + poverty_hispanic + poverty_other_races + poverty_white_nonhispanic) %>%
ggplot(aes(poverty, poverty_summed)) +
geom_point(alpha = 0.2, size = 1) +
coord_equal() +
labs(title = "The Counts Are Unequal because the Groups Aren't Disjoint") +
scatter_grid()
We turn the count of people in poverty into a rate.
tracts <- tracts %>%
mutate(poverty_rate = poverty / people)
stopifnot(min(tracts$poverty_rate) >= 0)
stopifnot(max(tracts$poverty_rate) <= 1)
We calculate the rate of poverty in high-poverty tracts.
tracts <- tracts %>%
mutate(
high_poverty = if_else(poverty_rate > 0.4, poverty, 0),
high_poverty_black = if_else(poverty_rate > 0.4, poverty_black, 0),
high_poverty_hispanic = if_else(poverty_rate > 0.4, poverty_hispanic, 0),
high_poverty_other_races = if_else(poverty_rate > 0.4, poverty_other_races, 0),
high_poverty_white_nonhispanic = if_else(poverty_rate > 0.4, poverty_white_nonhispanic, 0)
)
We calculate the overall poverty and the number of people without a poverty estimate and then sum to the county level.
counties_summary <- tracts %>%
group_by(state, county) %>%
summarize(
people = sum(people),
tracts = n(),
# poverty
poverty = sum(poverty),
poverty_black = sum(poverty_black),
poverty_hispanic = sum(poverty_hispanic),
poverty_other_races = sum(poverty_other_races),
poverty_white_nonhispanic = sum(poverty_white_nonhispanic),
# high poverty
high_poverty = sum(high_poverty),
high_poverty_black = sum(high_poverty_black),
high_poverty_hispanic = sum(high_poverty_hispanic),
high_poverty_other_races = sum(high_poverty_other_races),
high_poverty_white_nonhispanic = sum(high_poverty_white_nonhispanic),
# MOE
poverty_se = sqrt(sum(poverty_moe ^ 2)) / 1.645,
poverty_black_se = sqrt(sum(poverty_black_moe ^ 2)) / 1.645,
poverty_hispanic_se = sqrt(sum(poverty_hispanic_moe ^ 2)) / 1.645,
poverty_other_races_se = sqrt(sum(poverty_other_races_moe ^ 2)) / 1.645,
poverty_white_nonhispanic_se = sqrt(sum(poverty_white_nonhispanic_moe ^ 2)) / 1.645
) %>%
ungroup()
counties_summary <- counties_summary %>%
mutate(poverty_rate = poverty / people)
We pull in the county-level data and compare it to the calculated county-level data. The poverty rates should be identical; however, they may differ from numbers published elsewhere (like here) that use Small-Area Income and Poverty Estimates (SAIPE).
counties_test <- map_df(state_fips, ~getCensus(name = "acs/acs5",
vars = vars,
region = "county:*",
regionin = .x,
vintage = 2021)) %>%
as_tibble()
counties_test <- counties_test %>%
mutate(poverty_rate_test = B17001_002E / B17001_001E) %>%
select(state, county, poverty_rate_test) %>%
arrange(state, county)
stopifnot(
counties_summary %>%
select(state, county, poverty_rate) %>%
left_join(counties_test, by = c("state", "county")) %>%
filter(poverty_rate != poverty_rate_test) %>%
nrow() == 0
)
rm(counties_test)
We need the conditional logic to deal with division by zero. If there is no poverty then poverty exposure is zero.
counties_summary <- counties_summary %>%
mutate(
poverty_exposure = high_poverty / poverty,
poverty_exposure_black =
if_else(condition = poverty_black > 0,
true = high_poverty_black / poverty_black,
false = 0),
poverty_exposure_hispanic =
if_else(condition = poverty_hispanic > 0,
true = high_poverty_hispanic / poverty_hispanic,
false = 0),
poverty_exposure_other_races =
if_else(condition = poverty_other_races > 0,
true = high_poverty_other_races / poverty_other_races,
false = 0),
poverty_exposure_white_nonhispanic =
if_else(condition = poverty_white_nonhispanic > 0,
true = high_poverty_white_nonhispanic / poverty_white_nonhispanic,
false = 0),
)
stopifnot(
all(map_dbl(counties_summary, ~sum(is.na(.x))) == 0)
)
Let’s look at the highest values.
counties_summary %>%
arrange(desc(poverty_exposure)) %>%
select(state, county, poverty_exposure)
## # A tibble: 3,143 × 3
## state county poverty_exposure
## <chr> <chr> <dbl>
## 1 46 031 1
## 2 46 095 1
## 3 46 121 1
## 4 46 137 1
## 5 48 127 0.863
## 6 46 085 0.861
## 7 22 035 0.858
## 8 46 071 0.848
## 9 48 377 0.784
## 10 51 750 0.740
## # … with 3,133 more rows
There shouldn’t be any missing values.
stopifnot(
counties_summary %>%
filter(is.na(poverty_exposure)) %>%
nrow() == 0
)
The table shows the calculated metrics. Click on the variable columns to sort the table.
counties_summary %>%
ggplot(aes(poverty_exposure)) +
geom_histogram() +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
labs(title = "Most Counties in 2021 Have No Poverty Exposure",
subtitle = "The Distribution of Poverty Exposure")
counties_summary %>%
ggplot(aes(tracts, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
scatter_grid() +
labs(title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Number of Tracts in County")
counties_summary %>%
ggplot(aes(people, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
scatter_grid() +
labs(title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Population in County")
counties_summary %>%
ggplot(aes(poverty_rate, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
scatter_grid() +
labs(title = "County Poverty Rate and County Poverty Exposure Are Related")
counties_summary_subgroups_plots <- counties_summary %>%
select(state, county, contains("exposure")) %>%
# pivot to very long
pivot_longer(c(-state, -county), names_to = "subgroup", values_to = "poverty_exposure") %>%
# clean up names
mutate(subgroup =
recode(
subgroup,
poverty_exposure = "All",
poverty_exposure_black = "Black",
poverty_exposure_hispanic = "Hispanic",
poverty_exposure_other_races = "Other Races and Ethnicities",
poverty_exposure_white_nonhispanic = "White, Non-Hispanic"
)
)
counties_summary_subgroups_plots %>%
filter(subgroup != "All") %>%
ggplot(aes(poverty_exposure)) +
geom_histogram() +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
facet_wrap(~subgroup) +
labs(title = "Most Counties in 2021 Have No Poverty Exposure",
subtitle = "The Distribution of Poverty Exposure")
counties_summary_subgroups_plots <- left_join(counties_summary_subgroups_plots, select(counties_summary, -poverty_exposure), by = c("state", "county"))
counties_summary_subgroups_plots %>%
filter(subgroup!= "All") %>%
ggplot(aes(tracts, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
facet_wrap(~subgroup, nrow = 2) +
scatter_grid() +
labs(title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Number of Tracts in County")
counties_summary_subgroups_plots %>%
filter(subgroup!= "All") %>%
ggplot(aes(people, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
facet_wrap(~subgroup) +
scatter_grid() +
labs(title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Population in County")
counties_summary_subgroups_plots %>%
filter(subgroup!= "All") %>%
ggplot(aes(poverty_rate, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
facet_wrap(~subgroup) +
scatter_grid() +
labs(title = "County Poverty Rate and County Poverty Exposure Are Related")
rm(counties_summary_subgroups_plots)
#' Suppress counties
#'
#' @param race The variable for the count in a race/ethnicity group
#' @param exposure The variable name for the exposure index
#' @param threshold The minimum size of the race group to report the exposure index
#'
#' @return
#'
suppress_county <- function(race, exposure, threshold) {
exposure <- if_else(race <= threshold, as.numeric(NA), exposure)
return(exposure)
}
counties_summary %>%
summarize(
all = sum(is.na(poverty_exposure)),
black_nh = sum(is.na(poverty_exposure_black)),
hispanic = sum(is.na(poverty_exposure_hispanic)),
other_nh = sum(is.na(poverty_exposure_other_races)),
white_nh = sum(is.na(poverty_exposure_white_nonhispanic))
)
## # A tibble: 1 × 5
## all black_nh hispanic other_nh white_nh
## <int> <int> <int> <int> <int>
## 1 0 0 0 0 0
counties_summary <- counties_summary %>%
mutate(
# overall
poverty_exposure =
suppress_county(
race = poverty,
exposure = poverty_exposure,
threshold = 30
),
# black
poverty_exposure_black =
suppress_county(
race = poverty_black,
exposure = poverty_exposure_black,
threshold = 30
),
# hispanic
poverty_exposure_hispanic =
suppress_county(
race = poverty_hispanic,
exposure = poverty_exposure_hispanic,
threshold = 30
),
# other races and ethnicities
poverty_exposure_other_races =
suppress_county(
race = poverty_other_races,
exposure = poverty_exposure_other_races,
threshold = 30
),
# white, non-hispanic
poverty_exposure_white_nonhispanic =
suppress_county(
race = poverty_white_nonhispanic,
exposure = poverty_exposure_white_nonhispanic,
threshold = 30
)
)
counties_summary %>%
summarize(
all = sum(is.na(poverty_exposure)),
black_nh = sum(is.na(poverty_exposure_black)),
hispanic = sum(is.na(poverty_exposure_hispanic)),
other_nh = sum(is.na(poverty_exposure_other_races)),
white_nh = sum(is.na(poverty_exposure_white_nonhispanic))
)
## # A tibble: 1 × 5
## all black_nh hispanic other_nh white_nh
## <int> <int> <int> <int> <int>
## 1 7 1053 580 323 28
We need to add data quality flags with 1,
2, or 3. The values are outlined in the data
standards.
1 - If the county coefficient of variation for the
count in the group is less than 0.22 - If the county coefficient of variation for the
count in the group is less than 0.43 - If the county coefficient of variation for the
count in the group exceeds 0.4NA - If the metric is missingcounties_summary <- counties_summary %>%
mutate(
poverty_cv = poverty_se / poverty,
poverty_black_cv = poverty_black_se / poverty_black,
poverty_hispanic_cv = poverty_hispanic_se / poverty_hispanic,
poverty_other_races_cv = poverty_other_races_se / poverty_other_races,
poverty_white_nonhispanic_cv = poverty_white_nonhispanic_se / poverty_white_nonhispanic
)
counties_summary %>%
filter(poverty_cv >= 0.4) %>%
ggplot(aes(poverty, poverty_cv, color = poverty <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "The Worst CVs Will be Dropped for n <= 30",
subtitle = "poverty <= 30 in yellow") +
scatter_grid()
counties_summary %>%
filter(poverty_black_cv >= 0.4) %>%
ggplot(aes(poverty_black, poverty_black_cv, color = poverty_black <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "Black: The Worst CVs Will be Dropped for n <= 30",
subtitle = "poverty_black <= 30 in yellow") +
scatter_grid()
counties_summary %>%
filter(poverty_hispanic_cv >= 0.4) %>%
ggplot(aes(poverty_hispanic, poverty_hispanic_cv, color = poverty_hispanic <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "Hispanic: The Worst CVs Will be Dropped for n <= 30",
subtitle = "poverty_hispanic <= 30 in yellow") +
scatter_grid()
counties_summary %>%
filter(poverty_other_races_cv >= 0.4) %>%
ggplot(aes(poverty_other_races, poverty_other_races_cv, color = poverty_other_races <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "Other Races and Ethnicities: The Worst CVs Will be Dropped for n <= 30",
subtitle = "poverty_other_races <= 30 in yellow") +
scatter_grid()
counties_summary %>%
filter(poverty_white_nonhispanic_cv >= 0.4) %>%
ggplot(aes(poverty_white_nonhispanic, poverty_white_nonhispanic_cv, color = poverty_white_nonhispanic <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "White, non_hispanic: The Worst CVs Will be Dropped for n <= 30",
subtitle = "white_nh <= 30 in yellow") +
scatter_grid()
#' Assign a data quality flag
#'
#' @param race A vector of counts of a race/ethnicity group within a county
#' @param exposure A race/ethnicity exposure metric
#'
#' @return A numeric data quality flag
#'
set_quality <- function(cv, exposure) {
quality <- case_when(
cv < 0.2 ~ 1,
cv < 0.4 ~ 2,
cv >= 0.4 ~ 3
)
quality <- if_else(is.na(exposure), as.numeric(NA), quality)
return(quality)
}
counties_summary <- counties_summary %>%
mutate(
poverty_exposure_quality = set_quality(cv = poverty_cv, exposure = poverty_exposure),
poverty_exposure_black_quality = set_quality(cv = poverty_black_cv, exposure = poverty_exposure_black),
poverty_exposure_hispanic_quality = set_quality(cv = poverty_hispanic_cv, exposure = poverty_exposure_hispanic),
poverty_exposure_other_races_quality = set_quality(cv = poverty_other_races_cv, exposure = poverty_exposure_other_races),
poverty_exposure_white_nonhispanic_quality = set_quality(cv = poverty_white_nonhispanic_cv, exposure = poverty_exposure_white_nonhispanic)
)
count(counties_summary, poverty_exposure_quality)
## # A tibble: 4 × 2
## poverty_exposure_quality n
## <dbl> <int>
## 1 1 2826
## 2 2 294
## 3 3 16
## 4 NA 7
count(counties_summary, poverty_exposure_black_quality)
## # A tibble: 4 × 2
## poverty_exposure_black_quality n
## <dbl> <int>
## 1 1 772
## 2 2 613
## 3 3 705
## 4 NA 1053
count(counties_summary, poverty_exposure_hispanic_quality)
## # A tibble: 4 × 2
## poverty_exposure_hispanic_quality n
## <dbl> <int>
## 1 1 611
## 2 2 1040
## 3 3 912
## 4 NA 580
count(counties_summary, poverty_exposure_other_races_quality)
## # A tibble: 4 × 2
## poverty_exposure_other_races_quality n
## <dbl> <int>
## 1 1 316
## 2 2 578
## 3 3 1926
## 4 NA 323
count(counties_summary, poverty_exposure_white_nonhispanic_quality)
## # A tibble: 4 × 2
## poverty_exposure_white_nonhispanic_quality n
## <dbl> <int>
## 1 1 2538
## 2 2 515
## 3 3 62
## 4 NA 28
Most of the counties with missing values are very small.
missing <- counties_summary %>%
filter(
is.na(poverty_exposure) |
is.na(poverty_exposure_black) |
is.na(poverty_exposure_hispanic) |
is.na(poverty_exposure_other_races) |
is.na(poverty_exposure_white_nonhispanic)
)
max(missing$people)
## [1] 124678
max(missing$tracts)
## [1] 28
We need to include all counties in the published data even if we don’t have a metric for the county. We load the county file and join our metrics to the county file.
# load the 2020 county file
all_counties <- read_csv(here::here("geographic-crosswalks", "data", "county-populations.csv")) %>%
filter(year == 2020) %>%
mutate(year = 2021)
final_data <- left_join(all_counties, counties_summary, by = c("state", "county")) %>%
select(year,
state,
county,
poverty_exposure,
poverty_exposure_quality)
stopifnot(nrow(final_data) == 3143)
# This was 3142 for 2018, but we did not remove Rio Arriba County, NM this time because there was not a data collection error that impacted the 2017-2021 5-year file
write_csv(final_data,
here::here("06_neighborhoods", "poverty-exposure", "poverty-exposure_2020.csv"))
# create a long version of the subgroup data
counties_summary_subgroups <- counties_summary %>%
select(state, county, contains("exposure")) %>%
# pivot to very long
pivot_longer(c(-state, -county), names_to = "subgroup", values_to = "poverty_exposure") %>%
# create new variable names
mutate(variable = if_else(str_detect(subgroup, "_quality"),
"poverty_exposure_quality",
"poverty_exposure")) %>%
mutate(subgroup = str_remove(subgroup, "_quality")) %>%
# pivot to long
pivot_wider(names_from = variable, values_from = poverty_exposure) %>%
# clean up names
mutate(subgroup =
recode(
subgroup,
poverty_exposure = "All",
poverty_exposure_black = "Black",
poverty_exposure_hispanic = "Hispanic",
poverty_exposure_other_races = "Other Races and Ethnicities",
poverty_exposure_white_nonhispanic = "White, Non-Hispanic"
)
)
# check the bounds of the poverty exposure metric
stopifnot(min(counties_summary_subgroups$poverty_exposure, na.rm = TRUE) >= 0)
stopifnot(max(counties_summary_subgroups$poverty_exposure, na.rm = TRUE) <= 1)
counties_summary_subgroups <- counties_summary_subgroups %>%
mutate(subgroup_type = if_else(subgroup == "All", "all", "race-ethnicity"))
# create a frame with all possible rows
all_counties_subgroups <-
expand_grid(
all_counties,
subgroup = c("All", "Black", "Hispanic", "Other Races and Ethnicities", "White, Non-Hispanic")
) %>%
mutate(subgroup_type = if_else(subgroup == "All", "all", "race-ethnicity"))
final_data_race_ethnicity <- left_join(all_counties_subgroups,
counties_summary_subgroups,
by = c("state", "county", "subgroup_type", "subgroup")) %>%
select(year,
state,
county,
subgroup_type,
subgroup,
poverty_exposure,
poverty_exposure_quality)
stopifnot(nrow(final_data_race_ethnicity) == 15715)
write_csv(final_data_race_ethnicity,
here::here("06_neighborhoods", "poverty-exposure", "poverty-exposure_race-ethnicity_2020.csv"))